#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* groupnorm_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#if LOCAL_SIZE>1\n"
"__kernel void groupnorm_plain_buf(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n"
"#ifdef DOUBLE_INPUTS\n"
" __global const FLOAT*input0,\n"
" __global const FLOAT*input1,\n"
"#else\n"
" __global const FLOAT*input,\n"
"#endif\n"
" __global FLOAT*output,\n"
" __private const int area,\n"
" __private const int group,\n"
" __private const int inside,\n"
" __private const int outside,\n"
"#ifdef GAMMA_BETA\n"
" __global const FLOAT *gamma,\n"
" __global const FLOAT *beta,\n"
"#endif\n"
" __private float epsilon){\n"
" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n"
" float local sum_mean_mnn[LOCAL_SIZE];\n"
" float local sum_mnn[LOCAL_SIZE];\n"
" if (pos.x<global_dim0 && pos.y<global_dim1 && pos.z<global_dim2) {\n"
" const int idx_out=pos.z;\n"
" const int lid=get_local_id(0);\n"
" const int offset=idx_out*inside;\n"
" const int inside_v4=(inside+3) >> 2;\n"
" \n"
"#ifdef DOUBLE_INPUTS\n"
" // The product of W and H is a multiple of 4\n"
" #ifdef WH_4\n"
" float4 in_sum=0;\n"
" int index=lid;\n"
" for(; index<inside_v4; index+=LOCAL_SIZE){\n"
" float4 in0=convert_float4(vload4(index,input0+offset));\n"
" in_sum += in0;\n"
" float in1=input1[idx_out*(inside/area)+index/(area/4)];\n"
" in_sum += (float4)(in1,in1,in1,in1);\n"
" }\n"
" sum_mean_mnn[lid]=in_sum.x+in_sum.y+in_sum.z+ in_sum.w;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+sum_mean_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" float4 mean=sum_mean_mnn[0]/(float4)inside;\n"
" in_sum=0;\n"
" index=lid;\n"
" for(; index<inside_v4; index+=LOCAL_SIZE){\n"
" float4 in0=convert_float4(vload4(index,input0+offset));\n"
" float in1=input1[idx_out*(inside/area)+index/(area/4)];\n"
" in_sum += (in0+(float4)(in1,in1,in1,in1)-mean)*(in0+(float4)in1-mean);\n"
" }\n"
" sum_mnn[lid]=in_sum.x+in_sum.y+in_sum.z+in_sum.w;\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" float4 square_sum=(float4)(sum_mnn[0]/inside);\n"
" float4 value=(float4)(1.0f/sqrt(square_sum.x+epsilon));\n"
" for(int i=lid; i<inside_v4; i+=LOCAL_SIZE){\n"
" float4 in0=convert_float4(vload4(i,input0+offset));\n"
" float in1=input1[idx_out*(inside/area)+i/(area/4)];\n"
" float4 out=(in0+(float4)(in1,in1,in1,in1)-mean)*value;\n"
" #ifdef GAMMA_BETA\n"
" int offset_gamma_beta=(idx_out % group)*inside/area+i/(area/4);\n"
" out=out*(float4)((float)gamma[offset_gamma_beta])+(float4)((float)beta[offset_gamma_beta]);\n"
" #endif\n"
" #ifdef SWISH\n"
" out=out*native_recip((float4)1+native_exp(convert_float4(-out)));\n"
" #endif\n"
" vstore4(CONVERT_FLOAT4(out),i,output+offset);\n"
" }\n"
" #else\n"
" \n"
" float in_sum=0;\n"
" int index=lid;\n"
" for(; index<inside; index+=LOCAL_SIZE){\n"
" float in0=input0[offset+index];\n"
" in_sum += in0;\n"
" float in1=input1[idx_out*(inside/area)+index/area];\n"
" in_sum += in1;\n"
" }\n"
" sum_mean_mnn[lid]=in_sum;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+sum_mean_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" float mean=sum_mean_mnn[0]/inside;\n"
" in_sum=0;\n"
" index=lid;\n"
" for(; index<inside; index+=LOCAL_SIZE){\n"
" float in0=input0[offset+index];\n"
" float in1=input1[idx_out*(inside/area)+index/area];\n"
" in_sum += (in0+in1-mean)*(in0+in1-mean);\n"
" }\n"
" sum_mnn[lid]=in_sum;\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" float square_sum=sum_mnn[0]/inside;\n"
" float value=1.0f/sqrt(square_sum+epsilon);\n"
" for(int i=lid; i<inside; i+=LOCAL_SIZE){\n"
" float in0=input0[offset+i];\n"
" float in1=input1[idx_out*(inside/area)+i/area];\n"
" float out=(in0+in1-mean)*value;\n"
" #ifdef GAMMA_BETA\n"
" int offset_gamma_beta=(idx_out % group)*inside/area+i/area;\n"
" out=out*(float)gamma[offset_gamma_beta]+(float)beta[offset_gamma_beta];\n"
" #endif\n"
" #ifdef SWISH\n"
" out=out*native_recip(1.0+native_exp(-out));\n"
" #endif\n"
" output[offset+i]=(FLOAT)out;\n"
" }\n"
" \n"
" #endif\n"
"#else\n"
" const int inside_remain=inside-((inside_v4-1) << 2);\n"
" float4 in_sum=0;\n"
" int index=lid;\n"
" for(; index<inside_v4-1; index+=LOCAL_SIZE){\n"
" float4 in=convert_float4(vload4(index,input+offset));\n"
" in_sum += in;\n"
" }\n"
" sum_mean_mnn[lid]=in_sum.x+in_sum.y+in_sum.z+ in_sum.w;\n"
" \n"
" float4 in_left=0;\n"
" if(index == inside_v4-1) {\n"
" in_left=convert_float4(vload4(inside_v4-1,input+offset));\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+in_left.x;\n"
" if(inside_remain>1) {\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+in_left.y;\n"
" }\n"
" if(inside_remain>2) {\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+in_left.z;\n"
" }\n"
" if(inside_remain>3) {\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+in_left.w;\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mean_mnn[lid]=sum_mean_mnn[lid]+sum_mean_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" float4 mean=(float4)(sum_mean_mnn[0]/inside);\n"
" in_sum=0;\n"
" index=lid;\n"
" for(; index<inside_v4-1; index+=LOCAL_SIZE){\n"
" float4 in=convert_float4(vload4(index,input+offset));\n"
" in_sum += (in-mean)*(in-mean);\n"
" }\n"
" sum_mnn[lid]=in_sum.x+in_sum.y+in_sum.z+in_sum.w;\n"
" \n"
" if(index == inside_v4-1) {\n"
" float4 in_left=convert_float4(vload4(inside_v4-1,input+offset));\n"
" in_sum=(in_left-mean)*(in_left-mean);\n"
" sum_mnn[lid]=sum_mnn[lid]+in_sum.x;\n"
" if(inside_remain>1) {\n"
" sum_mnn[lid]=sum_mnn[lid]+in_sum.y;\n"
" }\n"
" if(inside_remain>2) {\n"
" sum_mnn[lid]=sum_mnn[lid]+in_sum.z;\n"
" }\n"
" if(inside_remain>3) {\n"
" sum_mnn[lid]=sum_mnn[lid]+in_sum.w;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (lid<i)\n"
" sum_mnn[lid]=sum_mnn[lid]+sum_mnn[lid+i];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" float4 square_sum=(float4)(sum_mnn[0]/inside);\n"
" float4 value=(float4)(1.0f/sqrt(square_sum.x+epsilon));\n"
" // The product of W and H is a multiple of 4\n"
" #ifdef WH_4\n"
" for(int i=lid; i<inside_v4; i+=LOCAL_SIZE){\n"
" float4 in=convert_float4(vload4(i,input+offset));\n"
" float4 out=(in-mean)*value;\n"
" #ifdef GAMMA_BETA\n"
" int offset_gamma_beta=(idx_out % group)*inside/area+i/(area/4);\n"
" out=out*(float4)((float)gamma[offset_gamma_beta])+(float4)((float)beta[offset_gamma_beta]);\n"
" #endif\n"
" #ifdef SWISH\n"
" out=out*native_recip((float4)1+native_exp(convert_float4(-out)));\n"
" #endif\n"
" vstore4(CONVERT_FLOAT4(out),i,output+offset);\n"
" }\n"
" #else\n"
" for(int i=lid; i<inside; i+=LOCAL_SIZE){\n"
" float in=input[offset+i];\n"
" float out=(in-mean.x)*value.x;\n"
" #ifdef GAMMA_BETA\n"
" int offset_gamma_beta=(idx_out % group)*inside/area+i/area;\n"
" out=out*(float)gamma[offset_gamma_beta]+(float)beta[offset_gamma_beta];\n"
" #endif\n"
" #ifdef SWISH\n"
" out=out*native_recip(1.0+native_exp(-out));\n"
" #endif\n"
" \n"
" output[offset+i]=(FLOAT)out;\n"
" }\n"
" #endif\n"
"#endif\n"
" }\n"
"}\n"
"#endif\n"
;
#endif
}
